library(tidyverse)
## ── Attaching packages ───────────────────────────────────────────────────────────── tidyverse 1.2.1 ──
## ✔ ggplot2 3.2.1     ✔ purrr   0.3.2
## ✔ tibble  2.1.3     ✔ dplyr   0.8.3
## ✔ tidyr   1.0.0     ✔ stringr 1.4.0
## ✔ readr   1.3.1     ✔ forcats 0.4.0
## ── Conflicts ──────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
library(plotly)
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
library(rvest)
## Loading required package: xml2
## 
## Attaching package: 'rvest'
## The following object is masked from 'package:purrr':
## 
##     pluck
## The following object is masked from 'package:readr':
## 
##     guess_encoding
library(readr)
library(viridis)
## Loading required package: viridisLite
library(leaflet)
# Chunk defaults for the rendered report: echo code, hide warnings, and use a
# fixed figure size scaled to 90% of the output width.
knitr::opts_chunk$set(
  echo = TRUE,
  warning = FALSE,
  fig.width = 8,
  fig.height = 6,
  out.width = "90%"
)

# Continuous colour and fill scales default to viridis.
options(
  ggplot2.continuous.colour = "viridis",
  ggplot2.continuous.fill = "viridis"
)

# Shadow the default discrete scales with their viridis counterparts.
scale_colour_discrete <- scale_colour_viridis_d
scale_fill_discrete <- scale_fill_viridis_d

# Minimal theme with the legend below the plot for every ggplot in the report.
theme_set(theme_minimal() + theme(legend.position = "bottom"))
# Read the 2018 data file and let janitor normalise the column names.
data_2018 <- janitor::clean_names(read_csv("./data/2018data.csv"))
## Parsed with column specification:
## cols(
##   .default = col_character(),
##   TIME = col_time(format = ""),
##   `ZIP CODE` = col_double(),
##   LATITUDE = col_double(),
##   LONGITUDE = col_double(),
##   `NUMBER OF PERSONS INJURED` = col_double(),
##   `NUMBER OF PERSONS KILLED` = col_double(),
##   `NUMBER OF PEDESTRIANS INJURED` = col_double(),
##   `NUMBER OF PEDESTRIANS KILLED` = col_double(),
##   `NUMBER OF CYCLIST INJURED` = col_double(),
##   `NUMBER OF CYCLIST KILLED` = col_double(),
##   `NUMBER OF MOTORIST INJURED` = col_double(),
##   `NUMBER OF MOTORIST KILLED` = col_double(),
##   COLLISION_ID = col_double()
## )
## See spec(...) for full column specifications.
# Strip the "number_of_" prefix from the column names.
newnames <- str_replace(colnames(data_2018), "number_of_", "")
colnames(data_2018) <- newnames

# Analysis table:
#   * keep the original date string as `date_complete`,
#   * split date into month/day/year and time into hour/minute,
#   * drop address/identifier columns and the year,
#   * convert the date/time parts to numeric and lower-case the vehicle type,
#   * drop rows whose latitude is missing or zero.
tidy_data <- data_2018 %>%
  mutate(date_complete = date) %>%
  separate(date, into = c("month", "day", "year"), sep = "/") %>%
  separate(time, into = c("hour", "minute"), sep = ":") %>%
  select(
    -zip_code, -location, -on_street_name, -cross_street_name,
    -off_street_name, -collision_id, -year
  ) %>%
  rename(vehicle_type = vehicle_type_code_1) %>%
  mutate(
    day = as.numeric(day),
    month = as.numeric(month),
    hour = as.numeric(hour),
    minute = as.numeric(minute),
    # Missing latitudes become 0 so the single filter below removes both cases.
    latitude = replace_na(latitude, 0),
    vehicle_type = str_to_lower(vehicle_type)
  ) %>%
  filter(latitude != 0)

# Vehicle type

# Collision counts per hour for four vehicle categories.
# Any vehicle type containing "truck" is collapsed to "truck", and any type
# containing "sport utility" to "sport utility vehicle"; the "truck" rule is
# checked first, matching the order in which the labels were rewritten before.
# count() returns an ungrouped tibble with columns vehicle_type, hour and n.
vehicle_type_data <-
  tidy_data %>%
  mutate(
    vehicle_type = case_when(
      str_detect(vehicle_type, "truck") ~ "truck",
      str_detect(vehicle_type, "sport utility") ~ "sport utility vehicle",
      TRUE ~ vehicle_type
    )
  ) %>%
  filter(
    vehicle_type %in%
      c("taxi", "passenger vehicle", "truck", "sport utility vehicle")
  ) %>%
  count(vehicle_type, hour)

# One line per vehicle category. The scatter mode must be "lines": "line" is
# not one of plotly's mode flags ("lines", "markers", "text").
vehicle_type_data %>%
  plot_ly(
    x = ~hour, y = ~n, color = ~vehicle_type,
    type = "scatter", mode = "lines"
  ) %>%
  layout(
    title = "Collisions of Day for Different Vehicles",
    xaxis = list(title = "Hour of Day"),
    yaxis = list(title = "Collisions")
  )

# Top 10 Collision Reasons

# The ten most frequent values of `contributing_factor_vehicle_1`, with their
# record counts in `n`, ordered from most to least frequent.
reason_data <-
  tidy_data %>%
  group_by(contributing_factor_vehicle_1) %>%
  summarize(n = n()) %>%
  arrange(desc(n)) %>%
  head(10)

# Bar chart of those counts; bars are ordered by descending count.
# The title previously described an unrelated dataset (a copy-paste leftover).
reason_data %>%
  plot_ly(
    x = ~reorder(contributing_factor_vehicle_1, desc(n)),
    y = ~n,
    color = ~contributing_factor_vehicle_1,
    type = "bar"
  ) %>%
  layout(
    title = "Top 10 Contributing Factors for Collisions",
    xaxis = list(title = "Different Reasons"),
    yaxis = list(title = "Count")
  )

# Mapping

# NOTE(review): this rename stores latitude in `long` and longitude in `lat`.
# It is left untouched so that any later code reading `data_2018` still sees
# the same columns; the map below is built from `tidy_data`, whose coordinate
# columns are named correctly.
data_2018 <- tidy_data
data_2018 <- rename(data_2018, long = latitude, lat = longitude)

# Numeric viridis palette whose domain covers every record's injury count.
pal <- colorNumeric(
  palette = "viridis",
  domain = data_2018$persons_injured
)

# Records to map: longitude in (-75, -70] and more than two persons injured.
# The bounds are numeric; comparing the column with the strings "-70"/"-75"
# falls back to a locale-dependent string comparison.
map_data <- tidy_data %>%
  filter(longitude > -75, longitude <= -70) %>%
  filter(persons_injured > 2) %>%
  mutate(
    label = str_c("<b>vehicle type: ", vehicle_type, "</b><br>Month: ", month , sep = "")
  )

# Plot at most 2000 randomly chosen records; capping at the row count keeps
# sample_n() from failing when fewer rows survive the filters.
map_data %>%
  sample_n(min(2000, nrow(map_data))) %>%
  leaflet() %>%
  addTiles() %>%
  addProviderTiles(providers$CartoDB.Positron) %>%
  addLegend(
    "bottomright",
    pal = pal,
    values = ~persons_injured,
    title = "Persons Injured",
    opacity = 1
  ) %>%
  addCircleMarkers(
    lng = ~longitude,
    lat = ~latitude,
    color = ~pal(persons_injured),
    radius = 0.5,
    popup = ~label
  )
# Row count for each value of `borough` (missing boroughs form their own row).
summarise(group_by(data_2018, borough), n())
## # A tibble: 6 x 2
##   borough       `n()`
##   <chr>         <int>
## 1 BRONX         22121
## 2 BROOKLYN      46314
## 3 MANHATTAN     29728
## 4 QUEENS        40400
## 5 STATEN ISLAND  5988
## 6 <NA>          71583
data_2018_seperate <- tidy_data

# Columns added together for the per-record injured and killed totals.
# Selecting them by name (rather than by position) keeps the sums correct if
# the column order ever changes.
# NOTE(review): `persons_*` may already be the total of the pedestrian, cyclist
# and motorist columns, in which case these sums count each person twice.
# Confirm against the dataset's data dictionary.
injured_cols <- c(
  "persons_injured", "pedestrians_injured", "cyclist_injured", "motorist_injured"
)
killed_cols <- c(
  "persons_killed", "pedestrians_killed", "cyclist_killed", "motorist_killed"
)

data_kill_injured <- data_2018_seperate %>%
  select(
    month, persons_injured, persons_killed, pedestrians_injured,
    pedestrians_killed, cyclist_injured, cyclist_killed, motorist_injured,
    motorist_killed
  )

# Per-record totals; missing counts are ignored.
data_kill_injured$injured <- rowSums(data_kill_injured[injured_cols], na.rm = TRUE)
data_kill_injured$killed <- rowSums(data_kill_injured[killed_cols], na.rm = TRUE)

# Monthly totals in long form: one row per month and type
# ("sum_injured" / "sum_killed"), with the total in `number`.
data_kill_injured <-
  data_kill_injured %>%
  group_by(month) %>%
  summarise(
    sum_injured = sum(injured),
    sum_killed = sum(killed)
  ) %>%
  ungroup() %>%
  pivot_longer(
    sum_injured:sum_killed,
    names_to = "type",
    values_to = "number"
  )

# Monthly trend, one line per type; the x axis is labelled with month
# abbreviations (month.abb is "Jan" ... "Dec").
plot_kill_injured <- data_kill_injured %>%
  ggplot(aes(x = month, y = number, color = type)) +
  geom_point() +
  geom_line() +
  scale_x_continuous(breaks = 1:12, labels = month.abb) +
  scale_y_continuous(breaks = seq(0, 11000, 1000)) +
  labs(
    title = "Trend of People being Injured or Killed through the Year"
  ) +
  theme(
    axis.title = element_text(size = 14, face = "bold"),
    plot.title = element_text(hjust = 0.5, color = "Blue")
  )

# Convert to an interactive plotly figure and display it.
plot_kill_injured <- ggplotly(plot_kill_injured)
plot_kill_injured
# Columns added together for the per-record injured and killed totals,
# selected by name so the sums do not depend on column position.
# NOTE(review): `persons_*` may already be the total of the pedestrian, cyclist
# and motorist columns, in which case these sums count each person twice.
# Confirm against the dataset's data dictionary.
injured_cols_day <- c(
  "persons_injured", "pedestrians_injured", "cyclist_injured", "motorist_injured"
)
killed_cols_day <- c(
  "persons_killed", "pedestrians_killed", "cyclist_killed", "motorist_killed"
)

data_kill_injured_day <- data_2018_seperate %>%
  select(
    date_complete, persons_injured, persons_killed, pedestrians_injured,
    pedestrians_killed, cyclist_injured, cyclist_killed, motorist_injured,
    motorist_killed
  )

# Per-record totals; missing counts are ignored.
data_kill_injured_day$injured <-
  rowSums(data_kill_injured_day[injured_cols_day], na.rm = TRUE)
data_kill_injured_day$killed <-
  rowSums(data_kill_injured_day[killed_cols_day], na.rm = TRUE)

# Daily totals in long form: one row per date and type.
# `day` (day of the year) and `month` are derived from the date itself, read
# as month/day/year, instead of from hard-coded row counts, so they stay
# correct when a date is missing or the rows are not in calendar order.
data_kill_injured_day <-
  data_kill_injured_day %>%
  group_by(date_complete) %>%
  summarise(
    sum_injured = sum(injured),
    sum_killed = sum(killed)
  ) %>%
  ungroup() %>%
  pivot_longer(
    sum_injured:sum_killed,
    names_to = "type",
    values_to = "number"
  ) %>%
  mutate(
    parsed_date = as.Date(date_complete, format = "%m/%d/%Y"),
    day = as.integer(format(parsed_date, "%j")),
    month = as.integer(format(parsed_date, "%m"))
  ) %>%
  select(-parsed_date)

# Daily trend, one line per type; x tick labels are hidden because there is
# one tick per date.
plot_kill_injured_day <- data_kill_injured_day %>%
  ggplot(aes(x = date_complete, y = number, group = type, color = type)) +
  geom_line() +
  theme(
    axis.text.x = element_blank(),
    axis.title = element_text(size = 14, face = "bold"),
    plot.title = element_text(hjust = 0.5, color = "Blue")
  ) +
  labs(
    title = "Trend of People being Injured or Killed through the Day",
    x = "Day of the Year"
  )

# Show the static figure, then its interactive plotly version.
plot_kill_injured_day

plot_kill_injured_day <- ggplotly(plot_kill_injured_day)
plot_kill_injured_day

```